TEST 123 testing

h1b_2018 = read.csv(file = "./data/dataset.csv")  
## this is a test to read and open the dataset
## I created a folder in our project called "data" 
## then inside it I downloaded the dataset and called it "dataset.csv" 



### this selects the 16 variables we are interested in
h1b_18_filter = h1b_2018 %>% 
  janitor::clean_names() %>% 
  select(case_number, visa_class, case_status, employer_name, employer_country, employer_city, employer_state,
         naics_code, soc_code, soc_name, total_workers, employment_start_date, employment_end_date, full_time_position,
         prevailing_wage, pw_unit_of_pay, worksite_city, worksite_state, h1b_dependent) %>%
  filter(case_status == "CERTIFIED",
          visa_class == "H-1B",
          employer_country == "UNITED STATES OF AMERICA") %>% 
  select(-visa_class, -case_status, -employer_country)
library(ggplot2)
library(plotly)
h1b_18_filter %>%
  group_by(employer_state) %>%
  summarize(state_n = n())
## # A tibble: 55 x 2
##    employer_state state_n
##    <fct>            <int>
##  1 AK                  74
##  2 AL                1177
##  3 AR                2531
##  4 AZ                3842
##  5 CA               95700
##  6 CO                3208
##  7 CT                5142
##  8 DC                1706
##  9 DE                2370
## 10 FL               16738
## # ... with 45 more rows
h1b_location =
  h1b_18_filter %>% 
  select(case_number, employer_city, employer_state, worksite_city, worksite_state) 

h1b_employer_50 = h1b_location %>% 
  filter(!employer_state %in% c("DC", "GU", "MP", "PR", "VI")) %>% 
  group_by(employer_state) %>% 
  summarize(employer_number = n()) %>% 
  rename("state" = "employer_state")

h1b_worksite_50 = h1b_location %>% 
  filter(!worksite_state %in% c("DC", "GU", "MH",  "MP", "PR", "VI")) %>% 
  group_by(worksite_state) %>% 
  summarize(worksite_number = n()) %>% 
  rename("state" = "worksite_state")


h1b_location_data_big = 
  full_join(h1b_employer_50, h1b_worksite_50, by = "state") %>% 
  filter(worksite_number >= 2000 | employer_number >= 2000) 
## Warning: Column `state` joining factors with different levels, coercing to
## character vector
h1b_location_data_small =
  full_join(h1b_employer_50, h1b_worksite_50, by = "state") %>% 
  filter(worksite_number < 2000 & employer_number < 2000) %>% 
  mutate(employer_number = sum(employer_number),
         worksite_number = sum(worksite_number)) %>% 
  mutate(state = "Others") %>% 
  select(state, employer_number, worksite_number) 
## Warning: Column `state` joining factors with different levels, coercing to
## character vector
others = unique(h1b_location_data_small) 

h1b_location_data = 
  bind_rows(h1b_location_data_big , others)
  

h1b_location_data %>% 
  mutate(state = fct_reorder(state, worksite_number)) %>% 
  plot_ly(x = ~state, y = ~employer_number, type = 'bar', name = 'headquater location') %>% 
  add_trace(y = ~worksite_number, name = 'work location') %>%
  layout(yaxis = list(title = 'Count'), barmode = 'group', legend = list(x = 0.1, y = 0.9))

Map showing number of H1b in worksite state

state_ws = 
  h1b_2018 %>%
  group_by(worksite_state) %>%
  summarize(n = n())
state_ws$hover <- with(state_ws, paste("State", worksite_state, "<br>", "Number", n))
g2 <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)
map_2 <- plot_geo(state_ws, locationmode = 'USA-states') %>%
  add_trace(
    z = ~n, text = ~hover, locations = ~worksite_state,
    color = ~n, colors = 'Greens'
  ) %>%
  colorbar(title = "H1b Case Number") %>%
  layout(
    title = '2018 US H1b by worksite State',
    geo = g2
  )
map_2

Map showing number of H1b in worksite state

state_ws = 
  h1b_2018 %>%
  group_by(worksite_state) %>%
  summarize(n = n())
state_ws$hover <- with(state_ws, paste("State", worksite_state, "<br>", "Number", n))
g2 <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)
map_2 <- plot_geo(state_ws, locationmode = 'USA-states') %>%
  add_trace(
    z = ~n, text = ~hover, locations = ~worksite_state,
    color = ~n, colors = 'Greens'
  ) %>%
  colorbar(title = "H1b Case Number") %>%
  layout(
    title = '2018 US H1b by worksite State',
    geo = g2
  )
map_2

Map showing number of H1b in employer state

state_number = 
  h1b_2018 %>%
  group_by(employer_state) %>%
  summarize(n = n())

state_number$hover <- with(state_number, paste("State", employer_state, "<br>", "Number", n))

l <- list(color = toRGB("white"), width = 2)

g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

p <- plot_geo(state_number, locationmode = 'USA-states') %>%
  add_trace(
    z = ~n, text = ~hover, locations = ~employer_state,
    color = ~n, colors = 'Purples'
  ) %>%
  colorbar(title = "Millions USD") %>%
  layout(
    title = '2018 US H1b by State',
    geo = g
  )

p

Graph showing top 10 cities with H1b cases in California state (worksite)

city_ca = 
  h1b_2018 %>%
  filter(worksite_state == 'CA') %>% 
  group_by(worksite_city) %>%
  summarize(n = n()) %>% 
  filter(min_rank(desc(n)) <= 10)
city_ca %>% 
  mutate(worksite_city = fct_reorder(worksite_city, n)) %>% 
  plot_ly(x = ~worksite_city, y = ~n, color = ~worksite_city, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Companies that have the most H1B Visa Workers

#Listed the top 15 companies that the hires the most H1B applicants. 
h1b_soc = h1b_18_filter %>% 
    select(case_number, employer_name, naics_code, soc_code, soc_name, total_workers, employment_start_date, employment_end_date, full_time_position)

  
h1b_soc_con = h1b_soc %>% 
  select(employer_name, total_workers) %>% 
  group_by(employer_name) %>% 
  summarize(total = n()) %>% 
  arrange(desc(total)) %>% 
  top_n(15) %>% 
  select(employer_name, total) %>% 
  mutate(employer_name = as.factor(employer_name)) %>% 
  mutate(employer_name = fct_reorder(employer_name, total))
## Selecting by total
#h1b_soc_con %>% 
  #ggplot(aes(x = reorder(employer_name, total), y = total)) +
  #geom_bar(stat = 'identity') +
  #coord_flip() +
  #labs(x = 'Companies', y = 'Total Workers', 
       #title = 'Top 15 companies who has the most H1B Visa Workers')


h1b_top_plotly = h1b_soc_con %>% 
  plot_ly(x = ~employer_name, y = ~total, color = ~employer_name, type = 'bar', colors = "Set2") %>% 
        layout(title = 'Top 15 companies who has the most H1B Visa Workers',
           xaxis = list(title = 'Companies'),
           yaxis = list(title = 'Total Workers'), 
           showlegend = FALSE)

h1b_top_plotly

Regrouping SOC codes broadly - use bls website as source

#The soc code were too specific. The bls website shows that the first two numbers of the soc code categorizes the industry more broadly. Therefore, case_when was used to make a new soc_code variable. Collapsed most of the industries to others for a reasonable graph and table. 
h1b_soc_ind = h1b_soc %>%
  select(case_number, soc_code) %>% 
  mutate(soc_code_new = str_remove_all(soc_code, '-'),
         soc_code_new = as.integer(soc_code_new)) %>% 
  na.omit() %>% 
  mutate(soc_code_gen = case_when(
    soc_code_new >= 110000 & soc_code_new < 120000 ~ "Management",
    soc_code_new >= 130000 & soc_code_new < 140000 ~"Business, Finance",
    soc_code_new >= 150000 & soc_code_new < 160000 ~ "Computer, Math",
    soc_code_new >= 170000 & soc_code_new < 180000 ~ "Architecture, Engineer",
    soc_code_new >= 190000 & soc_code_new < 200000 ~ "Life, Physcial, Social Science",
    soc_code_new >= 210000 & soc_code_new < 220000 ~ "Social Service",
    soc_code_new >= 230000 & soc_code_new < 240000 ~ "Legal",
    soc_code_new >= 250000 & soc_code_new < 260000 ~ "Education, Training",
    soc_code_new >= 270000 & soc_code_new < 280000 ~ "Media, Design",
    soc_code_new >= 290000 & soc_code_new < 300000 ~ "Healthcare Practitioner",
    soc_code_new >= 410000 & soc_code_new < 420000 ~ "Sales",
    TRUE ~ "Others")) %>% 
  mutate(soc_code_gen = as.factor(soc_code_gen))
## Warning: NAs introduced by coercion

Certified H1B applicants and which industries they are in

#Using the newly coded industry variable, checked to see which sector is the most popular with the H1B applicants. 
h1b_soc_small = h1b_soc_ind %>%
  group_by(soc_code_gen) %>%
  summarize(num_ind = n()) %>% 
  arrange(desc(num_ind)) %>% 
  mutate(soc_code_gen = fct_reorder(soc_code_gen, num_ind))
  
#h1b_soc_plot = h1b_soc_small %>% 
  #ggplot(aes(x = reorder(soc_code_gen, num_ind), y = num_ind)) +
  #geom_bar(stat = 'identity') +
  #coord_flip() +
  #labs(x = 'Industries', y = 'Total Workers', 
       #title = 'Industries and number of certified H1B appliants')

h1b_soc_plot = plot_ly(
    h1b_soc_small, x = ~soc_code_gen, y = ~num_ind, color = ~soc_code_gen, type = "bar", colors = "Set1"
    ) %>% 
        layout(title = 'Industries and number of certified H1B applicants',
           xaxis = list(title = 'Industries'),
           yaxis = list(title = 'Total Workers'), 
           showlegend = FALSE)

h1b_soc_plot

Length of time employees were contracted.

#Checking to see how long employees are contracted for. Seems like most are contracted for 3 years. There's a minor number of people who are contacted for 2 years or 1 year. 
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
h1b_soc_date <- h1b_soc %>%
  select(employment_start_date, employment_end_date) %>%
  mutate(employment_start_date = as.Date(employment_start_date),
         employment_end_date = as.Date(employment_end_date),
         year_start = year(employment_start_date),
         year_end = year(employment_end_date),
         length_time = year_end - year_start, 
         length_time= as.factor(length_time))
date_table <- h1b_soc_date %>%
  group_by(length_time) %>%
  summarize(num_year = n())
## Warning: Factor `length_time` contains implicit NA, consider using
## `forcats::fct_explicit_na`
ggplot(data = h1b_soc_date, aes(x = length_time)) +
  geom_bar()

#Checking how the unit of pay actually looks like, before making the new wage variable.
money = h1b_18_filter %>% 
  select(prevailing_wage, pw_unit_of_pay) 
summary(money)
##  prevailing_wage      pw_unit_of_pay  
##  Min.   :     7.2            :     0  
##  1st Qu.: 65728.0   Bi-Weekly:    29  
##  Median : 80974.0   Hour     : 37594  
##  Mean   : 81070.4   Month    :   198  
##  3rd Qu.: 97323.0   Week     :    66  
##  Max.   :456610.0   Year     :529092

Converting wage to the same units

## this creates a new column for the adjusted wage per year 
h1b_18_adjusted_wage = 
  h1b_18_filter %>% 
  mutate(pw_unit_of_pay = str_to_lower(pw_unit_of_pay), 
         adjusted_wage_per_year = prevailing_wage)  

## we know that there are 5 levels for pw_unit_of_pay
## year, hour, week, month, bi-weekly 

## this converts all pay wages to yearly 
h1b_18_adjusted_wage = 
  h1b_18_adjusted_wage %>%
##select(pw_unit_of_pay, adjusted_wage_per_year, prevailing_wage) %>% 
  mutate(adjusted_wage_per_year = 
           if_else(pw_unit_of_pay == "hour", prevailing_wage * 40 * 52, 
                   if_else(pw_unit_of_pay == "week", prevailing_wage * 52, 
                           if_else(pw_unit_of_pay == "month", prevailing_wage * 12, 
                                   if_else(pw_unit_of_pay == "bi-weekly", prevailing_wage * 26, 
                                           prevailing_wage)))))

Annual Wage by industries

# The new annual wage variable and the industries tables are not in the same table, so inner joined by case_number to form one table with both variables.
h1b_soc_wage = h1b_18_adjusted_wage %>% 
  select(case_number, adjusted_wage_per_year) %>% 
  inner_join(h1b_soc_ind) %>% 
  select(soc_code_gen, adjusted_wage_per_year)%>% 
  arrange (desc(adjusted_wage_per_year)) %>% 
  mutate(soc_code_gen = fct_reorder(soc_code_gen, adjusted_wage_per_year))
## Joining, by = "case_number"
h1b_wage_plotly = plot_ly(
    h1b_soc_wage, x = ~soc_code_gen, y = ~adjusted_wage_per_year, color = ~soc_code_gen, type = "box", colors = "Set2"
    ) %>% 
        layout(title = 'Industries and wage distribution of certified H1B applicants',
           xaxis = list(title = 'Industries'),
           yaxis = list(title = 'Annual Wage'), 
           showlegend = FALSE)

h1b_wage_plotly

Average Wage by industries

##Wasn't sure if wage distribution or avg wage was more informative, so decided to do both and see which other group members like best. 
h1b_avg_wage = h1b_soc_wage %>% 
  group_by(soc_code_gen) %>% 
  summarize(avg_wage_ind = mean(adjusted_wage_per_year)) %>% 
  arrange (desc(avg_wage_ind)) %>% 
  mutate(soc_code_gen = fct_reorder(soc_code_gen, avg_wage_ind))


h1b_avgwage_plot = h1b_avg_wage%>% 
  plot_ly(x = ~soc_code_gen, y = ~avg_wage_ind, color = ~soc_code_gen, type = "bar", colors = "Set2"
    ) %>% 
        layout(title = 'Industries and average wage of certified H1B applicants',
           xaxis = list(title = 'Industries'),
           yaxis = list(title = 'Average Wage'), 
           showlegend = FALSE)

h1b_avgwage_plot